From a6f71b01535affd1d4e8367eee4dfb0c39689cb4 Mon Sep 17 00:00:00 2001 From: Jan Heller Date: Thu, 12 Jun 2008 17:10:55 +0000 Subject: [PATCH] Added fallback non-SSE routine if the input buffer is not aligned to 16 2008-06-12 Jan Heller * extensions/sse-fixups.c (conv_rgbaF_linear_rgb8_linear), (conv_rgbaF_linear_rgba8_linear): Added fallback non-SSE routine if the input buffer is not aligned to 16 bytes. svn path=/trunk/; revision=324 --- ChangeLog | 6 ++ extensions/sse-fixups.c | 131 ++++++++++++++++++++++++++++------------ 2 files changed, 97 insertions(+), 40 deletions(-) diff --git a/ChangeLog b/ChangeLog index 45c4f95..caf521e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2008-06-12 Jan Heller + + * extensions/sse-fixups.c (conv_rgbaF_linear_rgb8_linear), + (conv_rgbaF_linear_rgba8_linear): Added fallback non-SSE routine + if the input buffer is not aligned to 16 bytes. + 2008-06-05 Jan Heller * AUTHORS: Updated contact info. diff --git a/extensions/sse-fixups.c b/extensions/sse-fixups.c index f893923..1cf90ab 100644 --- a/extensions/sse-fixups.c +++ b/extensions/sse-fixups.c @@ -25,6 +25,7 @@ #include "babl.h" #include "babl-cpuaccel.h" +#include "extensions/util.h" #define INLINE inline @@ -49,28 +50,53 @@ conv_rgbaF_linear_rgb8_linear (unsigned char *src, long samples) { long n = samples; - g4float *g4src = (g4float *) src; - g4float v; - union { - g2int si; - unsigned char c[8]; - } u; - - while (n--) + if ((int) src & 0xF) { - v = *g4src++ * g4float_ff; - v = g4float_min(v, g4float_ff); - v = g4float_max(v, g4float_zero); - u.si = g4float_cvt2pi (v); - *dst++ = u.c[0]; - *dst++ = u.c[4]; - v = g4float_movhl (v, v); - u.si = g4float_cvt2pi (v); - *dst++ = u.c[0]; + // nonaligned buffers, we have to use fallback x87 code + float *fsrc = (float *) src; + int v; + + while (n--) + { + v = rint (*fsrc++ * 255.0); + *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v); + + v = rint (*fsrc++ * 255.0); + *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v); + + v = rint (*fsrc++ * 255.0); + *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v); + + fsrc++; + } + } + else + { + // all is well, buffers are SSE compatible + g4float *g4src = (g4float *) src; + g4float v; + + union { + g2int si; + unsigned char c[8]; + } u; + + while (n--) + { + v = *g4src++ * g4float_ff; + v = g4float_min(v, g4float_ff); + v = g4float_max(v, g4float_zero); + u.si = g4float_cvt2pi (v); + *dst++ = u.c[0]; + *dst++ = u.c[4]; + v = g4float_movhl (v, v); + u.si = g4float_cvt2pi (v); + *dst++ = u.c[0]; + } + + g4float_emms (); } - - g4float_emms (); return samples; } @@ -82,29 +108,54 @@ conv_rgbaF_linear_rgba8_linear (unsigned char *src, long samples) { long n = samples; - g4float *g4src = (g4float *) src; - g4float v; - - union { - g2int si; - unsigned char c[8]; - } u; - - while (n--) + if ((int) src & 0xF) { - v = *g4src++ * g4float_ff; - v = g4float_min(v, g4float_ff); - v = g4float_max(v, g4float_zero); - u.si = g4float_cvt2pi (v); - *dst++ = u.c[0]; - *dst++ = u.c[4]; - v = g4float_movhl (v, v); - u.si = g4float_cvt2pi (v); - *dst++ = u.c[0]; - *dst++ = u.c[4]; + // nonaligned buffers, we have to use fallback x87 code + float *fsrc = (float *) src; + int v; + + while (n--) + { + v = rint (*fsrc++ * 255.0); + *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v); + + v = rint (*fsrc++ * 255.0); + *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v); + + v = rint (*fsrc++ * 255.0); + *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v); + + v = rint (*fsrc++ * 255.0); + *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v); + } + } + else + { + // all is well, buffers are SSE compatible + g4float *g4src = (g4float *) src; + g4float v; + + union { + g2int si; + unsigned char c[8]; + } u; + + while (n--) + { + v = *g4src++ * g4float_ff; + v = g4float_min(v, g4float_ff); + v = g4float_max(v, g4float_zero); + u.si = g4float_cvt2pi (v); + *dst++ = u.c[0]; + *dst++ = u.c[4]; + v = g4float_movhl (v, v); + u.si = g4float_cvt2pi (v); + *dst++ = u.c[0]; + *dst++ = u.c[4]; + } + + g4float_emms (); } - - g4float_emms (); return samples; } -- 2.30.2